# Read the raw daily temperature data (Fahrenheit, one row per city-day).
city_temp_clean <- read.csv("city_temperature.csv")
# Clean: drop known bad records.
city_temp_clean <- city_temp_clean %>%
filter(Year != 200, Year != 201, Day != 0, Year != 2020, State != "Washington DC")
# Year 200, Year 201, and Day 0 are all bad data.
# Year 2020 is incomplete (only through May), which is why the filter above
# removes it. (The original comment said "Year 2000" -- a typo.)
# The dataset codes missing temperatures as -99; recode those as NA.
city_temp_clean$AvgTemperature[city_temp_clean$AvgTemperature == -99] <- NA
# Create a parsed Date column so lubridate date operations can be used.
city_temp_clean$parsedDate <- ymd(paste0(city_temp_clean$Year, "-", city_temp_clean$Month, "-", city_temp_clean$Day))
# Add a Celsius column, rounded to one decimal place.
city_temp_clean$AvgTempCelsius <- (city_temp_clean$AvgTemperature - 32) * (5 / 9)
# round() is vectorized; the original sapply(..., round) call was unnecessary.
city_temp_clean$AvgTempCelsius <- round(city_temp_clean$AvgTempCelsius, digits = 1)
# Cities whose record count is complete (>= 9131 daily observations).
full_record_cities <- city_temp_clean %>%
  group_by(City) %>%
  summarise(
    Annual_Avg = mean(AvgTempCelsius, na.rm = TRUE),
    Record_count = n()
  ) %>%
  filter(Record_count >= 9131) %>%
  arrange(desc(Record_count)) %>%
  dplyr::select(City)
# Keep only rows belonging to cities with a complete record.
city_temp_clean <- city_temp_clean %>%
  filter(City %in% full_record_cities$City)
# Cities with fewer than 91 NAs (roughly 1% of the full record),
# then restrict the data to those cities.
good_cities <- city_temp_clean %>%
  group_by(City) %>%
  summarise(nas_present = sum(is.na(AvgTempCelsius))) %>%
  filter(nas_present <= 91) %>%
  dplyr::select(City)
city_temp_clean <- city_temp_clean %>%
  filter(City %in% good_cities$City)
# Per-city NA counts and record counts, most NAs first.
# Note: summarise() already returns one row per City, so the trailing
# unique() in the original was a no-op and has been removed.
city_list <- city_temp_clean %>%
  group_by(City) %>%
  summarise(nas_present = sum(is.na(AvgTempCelsius)), record_count = n()) %>%
  arrange(desc(nas_present))
# Duplicate date records: city/date combinations observed more than once.
non_unique_dates <- city_temp_clean %>%
  group_by(Region, State, City, Country, parsedDate) %>%
  summarise(count_of_obs = n()) %>%
  filter(count_of_obs > 1) %>%
  arrange(desc(count_of_obs))
# Regional summary statistics (Celsius): mean, sd, median, record count.
summary_statistics_by_region <- city_temp_clean %>%
  group_by(Region) %>%
  dplyr::summarize(
    mean = mean(AvgTempCelsius, na.rm = TRUE),
    sd = sd(AvgTempCelsius, na.rm = TRUE),
    median = median(AvgTempCelsius, na.rm = TRUE),
    Record_count = n()
  ) %>%
  as.data.frame()
# Presentation-friendly column names.
summary_statistics_by_region <- summary_statistics_by_region %>%
  rename(
    "Mean Temperature (C)" = mean,
    "Median" = median,
    "Standard Deviation" = sd,
    "Number of Records" = Record_count
  )
# Styled HTML table of the regional summary.
summ_table_presentation <- summary_statistics_by_region %>%
  kbl(caption = "Summary Statistics by Region") %>%
  kable_styling(font_size = 16) %>%
  kable_paper("hover", full_width = FALSE) %>%
  kable_material_dark()
summ_table_presentation
| Region | Mean Temperature (C) | Standard Deviation | Median | Number of Records |
|---|---|---|---|---|
| Africa | 19.88590 | 5.802537 | 20.1 | 54792 |
| Asia | 19.06741 | 11.727034 | 23.2 | 182640 |
| Australia/South Pacific | 16.46013 | 5.355089 | 16.4 | 45660 |
| Europe | 10.76569 | 8.230676 | 11.2 | 302376 |
| Middle East | 23.12594 | 9.564097 | 23.4 | 109583 |
| North America | 13.71096 | 10.592926 | 15.0 | 1415309 |
| South/Central America & Carribean | 17.27409 | 7.106319 | 16.9 | 54792 |
# Global annual mean temperature with a fitted regression line and
# confidence band (ggpubr::ggscatter).
city_temp_clean %>%
  group_by(Year) %>%
  dplyr::summarize(
    "Annual Average Temperature (C)" = mean(AvgTempCelsius, na.rm = TRUE),
    SD = sd(AvgTempCelsius, na.rm = TRUE)
  ) %>%
  ggscatter(
    x = "Year",
    y = "Annual Average Temperature (C)",
    add = "reg.line",
    conf.int = TRUE,
    add.params = list(color = "red", fill = "lightblue"),
    color = "Annual Average Temperature (C)",
    cor.coef = TRUE,
    cor.method = "pearson",
    xlab = "Year",
    ylab = "Temperature in Celsius"
  ) +
  labs(
    title = "Linear Model of Annual Global Temperature",
    subtitle = "With Confidence Interval"
  ) +
  theme_bw()
# Subset to Mumbai (listed in the data as "Bombay (Mumbai)").
Mumbai <- filter(city_temp_clean, City == "Bombay (Mumbai)")
1.Is there a significant positive correlation between time and temperature increase in Mumbai?
2. Is there a correlation between an increase in manufacturing growth (as indicated by US $) and temperature increase in Mumbai?
For the first research question, the hypothesis test is as follows: H0: There is no relationship between time (years) and temperature. Ha: There is a positive linear relationship between time (years) and temperature.
Although annual seasonal fluctuations in temperature are a critical factor in analyzing temperature, Mumbai is relatively consistent compared to other regions of the world (https://en.climate-data.org/asia/india/maharashtra/mumbai-29/). After creating a new data frame, a new variable (average yearly temperatures) was run against temperature. The output examines the relationship between time (Indep var) on temperature (Dep var).
# Annual mean temperature for Mumbai, then regress temperature on year.
MumbaiYearandTemp <- Mumbai %>%
  group_by(Year) %>%
  dplyr::summarize(AvgTempCelsius = mean(AvgTempCelsius, na.rm = TRUE))
summary(lm(AvgTempCelsius ~ Year, data = MumbaiYearandTemp))
##
## Call:
## lm(formula = AvgTempCelsius ~ Year, data = MumbaiYearandTemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70343 -0.19792 0.03796 0.26524 0.47350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -62.38500 18.30411 -3.408 0.00241 **
## Year 0.04496 0.00912 4.929 5.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3288 on 23 degrees of freedom
## Multiple R-squared: 0.5137, Adjusted R-squared: 0.4926
## F-statistic: 24.3 on 1 and 23 DF, p-value: 5.558e-05
For each year that goes by, there is a .045 increase in average temperature on average (C).The adjusted R-squared value is .493 which means this model accounts for roughly 50% of variation in temperature. The slope of the fit is positive and the p-value (at 0.00005558) means that this relationship is statistically significant. There is a positive linear relationship between time and temperature when time is measured in years. It is worth noting that at the annual level, this fit is based on just a couple of dozen datapoints. A longer temperature trend would be needed to really prove that this trend is steady through time. This model is only commenting on the trend during this particular time slot.
# Linear model of yearly time vs temperature for Mumbai.
MumbaiYearandTemp %>%
  ggscatter(
    x = "Year",
    y = "AvgTempCelsius",
    add = "reg.line",
    conf.int = TRUE,
    add.params = list(color = "red", fill = "lightblue"),
    cor.coef = TRUE,
    cor.method = "pearson",
    xlab = "Year",
    ylab = "Temperature in Celsius"
  ) +
  labs(
    title = "Linear Model of Temperatures in Mumbai",
    subtitle = "With Confidence Interval"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Manufacturing growth and temperature
For the second research question, the hypothesis test is as follows: H0: There is no relationship between manufacturing growth (US $) and temperature. Ha: There is a positive relationship between manufacturing growth (US $) and temperature. The following chart examines the relationship between manufacturing growth (independent variable) and temperature (dependent variable).
# Manufacturing output (per the source, in US dollars, 1995-2019) from the
# same original data source, added as a new column.
# NOTE(review): assignment is positional, so it assumes MumbaiYearandTemp is
# ordered by Year 1995..2019 with exactly 25 rows -- confirm before reuse.
MumbaiYearandTemp['ManufactureGrowth'] <- c(85661962758, 93800942935, 93848956738, 96788159277, 102000000000, 109000000000, 112000000000,
120000000000, 127000000000, 137000000000, 149000000000, 176000000000, 188000000000, 197000000000, 219000000000, 235000000000, 243000000000,
256000000000, 269000000000, 290000000000, 328000000000, 354000000000,
380000000000, 401000000000, 391000000000)
# Regress annual mean temperature on manufacturing output.
summary(lm(AvgTempCelsius~ ManufactureGrowth, data = MumbaiYearandTemp))
##
## Call:
## lm(formula = AvgTempCelsius ~ ManufactureGrowth, data = MumbaiYearandTemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59609 -0.23756 0.05875 0.21587 0.44682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.712e+01 1.311e-01 206.835 < 2e-16 ***
## ManufactureGrowth 3.504e-12 5.704e-13 6.143 2.88e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2902 on 23 degrees of freedom
## Multiple R-squared: 0.6213, Adjusted R-squared: 0.6048
## F-statistic: 37.74 on 1 and 23 DF, p-value: 2.883e-06
# Scatter of manufacturing output vs average temperature with fitted line.
MumbaiYearandTemp %>%
  ggscatter(
    x = "ManufactureGrowth",
    y = "AvgTempCelsius",
    add = "reg.line",
    conf.int = TRUE,
    add.params = list(color = "red", fill = "lightblue"),
    color = "red",
    cor.coef = TRUE,
    cor.method = "pearson",
    xlab = "Manufacturing (Growth in US Dollars)",
    ylab = "Temperature (C)"
  ) +
  labs(
    title = "Linear Regression between Manufacturing Growth and Avg. Temperature (C)",
    subtitle = "With Confidence Interval"
  ) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
### Limitations and analysis considerations
As discussed previously, these data are limited for various reasons and any conclusions drawn from modeling must consider the complexity of climate change. While manufacturing seems to play a role in the increase of temperature in Mumbai, understanding this more fully requires knowledge of historical and regional characteristics, accurate data on a range of industries, and the role of economic sectors to name a few. Other variables from the dataset would have been interesting to examine such as natural resource depletion, consumption and energy use. However, missing data for many years would create challenges.
# Package management: pacman installs (if needed) and loads in one call.
library(pacman)
p_load(tidyverse, VIM, lubridate)
# Suppress scientific notation in printed output.
options(scipen = 999)
# Hourly air-quality data for four Beijing monitoring stations.
df_1 <- read_csv("data/PRSA_Data_Tiantan_20130301-20170228.csv")
df_2 <- read_csv("data/PRSA_Data_Shunyi_20130301-20170228.csv")
df_3 <- read_csv("data/PRSA_Data_Dingling_20130301-20170228.csv")
df_4 <- read_csv("data/PRSA_Data_Dongsi_20130301-20170228.csv")
For the data to be comparable with the temperature data set, the hourly measurements were converted into daily averages.
# Stack the four station data frames into one long table.
df <- bind_rows(df_1, df_2, df_3, df_4)
df
# Build a Date column from the year/month/day components.
df <- df %>%
mutate(date = make_date(year, month, day))
# Collapse hourly station readings into one daily average per pollutant.
# (Downstream code depends on these exact column names, including the
# mixed-case avg_No2.)
df_avgs <- df %>%
group_by(date) %>%
summarise(
avg_co = mean(CO, na.rm = TRUE),
avg_so2 = mean(SO2, na.rm =TRUE),
avg_No2 = mean(NO2, na.rm =TRUE),
avg_O3 = mean(O3, na.rm =TRUE)
)
df_avgs
## # A tibble: 1,461 × 5
## date avg_co avg_so2 avg_No2 avg_O3
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2013-03-01 350 5.84 19.2 71.4
## 2 2013-03-02 924. 25.2 53.3 34.2
## 3 2013-03-03 1688. 41.4 73 25.7
## 4 2013-03-04 738. 13.8 38.8 61.9
## 5 2013-03-05 1970. 62.7 96.6 84.5
## 6 2013-03-06 2637. 89.2 120. 56.5
## 7 2013-03-07 3505. 75.8 132. 55.3
## 8 2013-03-08 2684. 54.0 102. 97.6
## 9 2013-03-09 1186. 26.1 48.8 91.3
## 10 2013-03-10 725. 18.8 40.5 82.0
## # … with 1,451 more rows
summary(df_avgs)
## date avg_co avg_so2 avg_No2
## Min. :2013-03-01 Min. : 194.8 Min. : 1.208 Min. : 8.594
## 1st Qu.:2014-03-01 1st Qu.: 610.1 1st Qu.: 4.396 1st Qu.: 29.323
## Median :2015-03-01 Median : 931.2 Median : 8.684 Median : 38.758
## Mean :2015-03-01 Mean :1179.6 Mean : 14.560 Mean : 44.562
## 3rd Qu.:2016-02-29 3rd Qu.:1420.8 3rd Qu.: 18.547 3rd Qu.: 54.208
## Max. :2017-02-28 Max. :7857.9 Max. :127.053 Max. :153.802
## avg_O3
## Min. : 2.189
## 1st Qu.: 29.918
## Median : 54.802
## Mean : 59.375
## 3rd Qu.: 82.776
## Max. :180.781
# Restrict the Beijing temperature record to the air-quality window
# (2013-03-01 through 2017-02-28); the three sequential filters are merged
# into one.
beijing <- city_temp_clean %>%
  filter(
    City == "Beijing",
    parsedDate >= "2013-03-01",
    parsedDate <= "2017-02-28"
  )
# Rename the date column by name rather than by position: the original used
# colnames(beijing)[9], which silently renames the wrong column if the
# column order ever changes.
beijing <- beijing %>% rename(date = parsedDate)
# Join temperatures with the daily pollutant averages on date.
total <- merge(beijing, df_avgs, by = "date")
total
To get a sense of the data, graphs measuring the daily average concentration of each of the pollutants of interest were generated; they can be seen below.
# Daily-average time series for each pollutant of interest, then a 2x2 panel.
co <- total %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = avg_co), color = 'red') +
  ggtitle("Average Carbon Monoxide Emissions Over Time") +
  theme(plot.title = element_text(size=10))
so2 <- total %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = avg_so2), color = 'blue') +
  ggtitle("Average Sulfur Dioxide Emissions Over Time")+
  theme(plot.title = element_text(size=10))
no2 <- total %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = avg_No2), color = 'green') +
  ggtitle("Average Nitrogen Dioxide Emissions Over Time")+
  theme(plot.title = element_text(size=10))
o3 <- total %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = avg_O3), color = 'purple') +
  # Bug fix: this panel plots ozone (avg_O3); the original title was a
  # copy-paste of the sulfur dioxide title.
  ggtitle("Average Ozone Levels Over Time")+
  theme(plot.title = element_text(size=10))
# grid.arrange() is from gridExtra -- presumably loaded elsewhere; confirm.
grid.arrange(co, so2, no2, o3, ncol = 2)
# Multiple regression: daily average temperature (F) on the four daily
# pollutant averages.
model <- lm(AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3, data = total)
coef(model)
## (Intercept) avg_co avg_so2 avg_No2 avg_O3
## 29.627620983 -0.002147985 -0.459513721 0.286439234 0.385010587
summary(model)
##
## Call:
## lm(formula = AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3,
## data = total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.074 -8.674 0.575 8.702 43.060
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.6276210 1.2936175 22.903 < 0.0000000000000002 ***
## avg_co -0.0021480 0.0007055 -3.044 0.00237 **
## avg_so2 -0.4595137 0.0286716 -16.027 < 0.0000000000000002 ***
## avg_No2 0.2864392 0.0309857 9.244 < 0.0000000000000002 ***
## avg_O3 0.3850106 0.0105364 36.541 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.82 on 1450 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.5822, Adjusted R-squared: 0.581
## F-statistic: 505.1 on 4 and 1450 DF, p-value: < 0.00000000000000022
# Stepwise selection (both directions) starting from the full pollutant model.
# The AIC table below shows that dropping any single predictor raises AIC,
# so all four pollutants are retained and model2 equals the full model.
model2 <- step(lm(AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3, data = total), direction = "both")
## Start: AIC=7429.19
## AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3
##
## Df Sum of Sq RSS AIC
## <none> 238437 7429.2
## - avg_co 1 1524 239961 7436.5
## - avg_No2 1 14052 252489 7510.5
## - avg_so2 1 42237 280674 7664.5
## - avg_O3 1 219566 458003 8377.0
summary(model2)
##
## Call:
## lm(formula = AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3,
## data = total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.074 -8.674 0.575 8.702 43.060
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.6276210 1.2936175 22.903 < 0.0000000000000002 ***
## avg_co -0.0021480 0.0007055 -3.044 0.00237 **
## avg_so2 -0.4595137 0.0286716 -16.027 < 0.0000000000000002 ***
## avg_No2 0.2864392 0.0309857 9.244 < 0.0000000000000002 ***
## avg_O3 0.3850106 0.0105364 36.541 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.82 on 1450 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.5822, Adjusted R-squared: 0.581
## F-statistic: 505.1 on 4 and 1450 DF, p-value: < 0.00000000000000022
# Map a month number (1-12) to its Northern Hemisphere meteorological season.
#
# Implemented as a direct index into a lookup vector, which is vectorized:
# it accepts a vector of month numbers as well as a single value. (The
# original data.frame equality lookup only behaved correctly for scalars.)
# Returns a character vector; out-of-range months yield NA.
map_to_season_north <- function(month_int){
  seasons <- c('winter', 'winter', 'spring', 'spring', 'spring', 'summer',
               'summer', 'summer', 'fall', 'fall', 'fall', 'winter')
  seasons[month_int]
}
# Season label for each observation.
# NOTE(review): total$Month is already a month number, so the month()
# wrapper presumably returns its input unchanged here -- confirm; it appears
# redundant but harmless.
season <- unlist(lapply(month(total$Month), map_to_season_north))
total_w_season <- cbind(total, season)
total_w_season
# Re-fit the pollutant model with season as a categorical control.
model3 <- lm(AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3 + as.factor(season), data = total_w_season)
summary(model3)
##
## Call:
## lm(formula = AvgTemperature ~ avg_co + avg_so2 + avg_No2 + avg_O3 +
## as.factor(season), data = total_w_season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.798 -4.849 0.434 5.556 24.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.1958690 0.9399281 45.957 < 0.0000000000000002
## avg_co -0.0012077 0.0004892 -2.469 0.013672
## avg_so2 -0.1053013 0.0208949 -5.040 0.000000525406
## avg_No2 0.1340333 0.0215565 6.218 0.000000000658
## avg_O3 0.2076950 0.0089015 23.333 < 0.0000000000000002
## as.factor(season)spring -2.4660783 0.7191434 -3.429 0.000622
## as.factor(season)summer 13.8525639 0.7833078 17.685 < 0.0000000000000002
## as.factor(season)winter -21.8083026 0.7027941 -31.031 < 0.0000000000000002
##
## (Intercept) ***
## avg_co *
## avg_so2 ***
## avg_No2 ***
## avg_O3 ***
## as.factor(season)spring ***
## as.factor(season)summer ***
## as.factor(season)winter ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.332 on 1447 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.824, Adjusted R-squared: 0.8231
## F-statistic: 967.6 on 7 and 1447 DF, p-value: < 0.00000000000000022
# Annual average temperature for New Zealand with a least-squares trend line.
city_temp_clean %>%
  filter(Country == "New Zealand") %>%
  group_by(Year) %>%
  dplyr::summarize(
    Annual_Avg = mean(AvgTempCelsius, na.rm = TRUE),
    SD = sd(AvgTempCelsius, na.rm = TRUE)
  ) %>%
  ggplot(aes(Year, Annual_Avg)) +
  geom_line() +
  geom_smooth(method = 'lm', se = FALSE) +
  xlim(1995, 2019) +
  theme_bw() +
  labs(
    y = "Annual Average Temperature (Degrees Celsius)",
    title = "Average Temperature in NZ With Best Fit Line",
    subtitle = "1995-2019"
  )
## Data Preparation
# New data frame restricted to New Zealand records.
nz_temp <- filter(city_temp_clean, Country == 'New Zealand')
# Map a month number (1-12) to its Southern Hemisphere meteorological season.
# Used to add season information to the monthly record.
#
# Implemented as a direct index into a lookup vector, which is vectorized:
# it accepts a vector of month numbers as well as a single value. (The
# original data.frame equality lookup only behaved correctly for scalars.)
# Returns a character vector; out-of-range months yield NA.
map_to_season_south <- function(month_int){
  seasons <- c('summer', 'summer', 'fall', 'fall', 'fall', 'winter',
               'winter', 'winter', 'spring', 'spring', 'spring', 'summer')
  seasons[month_int]
}
# Sanity check: January is summer in the Southern Hemisphere.
map_to_season_south(1)
# Monthly mean temperatures for NZ, with season labels and a Date
# (firstofmonth) marking each unique month-year combination.
nz_monthly_temp <- nz_temp%>%
group_by(Year=year(parsedDate),Month=month(parsedDate)) %>%
summarise(AvgTempCelsius=mean(AvgTempCelsius, na.rm = TRUE))%>%
mutate(firstofmonth = ymd(paste0(Year,'-' , Month, '-1')))
# Season label for each monthly row (Southern Hemisphere mapping).
nz_seasons <- unlist(lapply(month(nz_monthly_temp$firstofmonth), map_to_season_south))
nz_monthly_temp <- cbind(nz_monthly_temp, nz_seasons)
# Restore simple column names after the cbind (renames nz_seasons to season).
names(nz_monthly_temp) <- c('Year','Month','AvgTempCelsius','firstofmonth','season')
head(nz_monthly_temp)
## # A tibble: 6 × 5
## # Groups: Year [1]
## Year Month AvgTempCelsius firstofmonth season
## <dbl> <dbl> <dbl> <date> <chr>
## 1 1995 1 19.5 1995-01-01 summer
## 2 1995 2 20.1 1995-02-01 summer
## 3 1995 3 18.7 1995-03-01 fall
## 4 1995 4 17.9 1995-04-01 fall
## 5 1995 5 14.2 1995-05-01 fall
## 6 1995 6 11.9 1995-06-01 winter
# Annual glacier ice volume data.
nz_ice<- read.csv("data/annual_glacier_volume.csv")
# Expand the annual data to a monthly data frame: complete() creates 12 rows
# per year, and fill() carries each year's annual volume down into its months.
nz_monthly_ice <- nz_ice%>%mutate(month = 1) %>%
complete(year, month = 1:12) %>%
fill(ice_volume_km.3)
#mutate_at(vars(ice_volume_km.3), ~./12)
# Character vector of seasons for each monthly row.
nz_seasons <- unlist(lapply(nz_monthly_ice$month, map_to_season_south))
# Add the season vector to the monthly ice data frame.
nz_monthly_ice <- cbind(nz_monthly_ice, nz_seasons)
nz_monthly_ice <- nz_monthly_ice%>%mutate(firstofmonth = ymd(paste0(year,'-' , month, '-1')))
# Rename columns (positional: ice_volume_km.3 becomes monthly_ice).
names(nz_monthly_ice) <- c("year", "month", "monthly_ice", "nz_seasons", "firstofmonth")
# Monthly seasonal multipliers for ice volume.
temp_monthly <- data.frame(
  month = 1:12,
  delta = c(.95, .95, .975, 1, 1.025, 1.05, 1.05, 1.025, 1, .975, .95, .925)
)
# Join the multipliers to the monthly ice record and perturb monthly values.
nz_monthly_ice <- left_join(nz_monthly_ice, temp_monthly, by = "month")
# n() sizes the noise vector to the data; the original hard-coded
# runif(300, ...), which silently breaks if the ice record covers a
# different number of months.
# NOTE(review): this draw happens before set.seed() below, so it is not
# reproducible across runs -- confirm whether that is intended.
nz_monthly_ice <- nz_monthly_ice %>% mutate(monthly_ice = delta * monthly_ice + runif(n(), -1, 1))
# Keep selected columns, coerce to numeric, and rebuild firstofmonth for the
# join to the temperature data.
nz_monthly_ice <- nz_monthly_ice %>% select(year, month, monthly_ice)
nz_monthly_ice$year <- as.numeric(nz_monthly_ice$year)
nz_monthly_ice$month <- as.numeric(nz_monthly_ice$month)
nz_monthly_ice$firstofmonth <- ymd(paste0(nz_monthly_ice$year, '-', nz_monthly_ice$month, '-1'))
# Join ice volume and temperature data on month.
nz_monthly_temp <- left_join(nz_monthly_temp, nz_monthly_ice, by = "firstofmonth")
nz_monthly_temp %>% select(firstofmonth, season, AvgTempCelsius, monthly_ice)
# Seed the RNG, then shift the glacial series.
set.seed(1000)
# NOTE(review): runif(1, -1, 1) draws a single value, so every row gets the
# same constant shift -- if per-row noise was intended this should be
# runif(nrow(nz_monthly_temp), -1, 1). Left as-is pending confirmation.
nz_monthly_temp$monthly_ice <- nz_monthly_temp$monthly_ice + runif(1, -1, 1)
# Monthly ice volume over time with a best-fit line.
nz_monthly_temp %>%
  ggplot(aes(firstofmonth, monthly_ice)) +
  geom_line() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme_bw() +
  labs(y = "Monthly Average Ice Volume (KM^3)", x = "Month",
       title = "Monthly Ice in NZ With Best Fit Line", subtitle = "1995-2019")
# Scatter of temperature vs ice volume with the Pearson correlation shown.
# Fixed: the y-axis label was missing its closing parenthesis.
ggscatter(nz_monthly_temp, x = "AvgTempCelsius", y = "monthly_ice",
          color = "red", cor.coef = TRUE,
          cor.method = "pearson",
          xlab = "Temperature (C)", ylab = "Ice (km^3)")
# Regress monthly NZ temperature on the month-start date.
# firstofmonth is a Date, so the slope is per day, not per month.
nz_monthly_temp %>%
lm(AvgTempCelsius~firstofmonth, data= .) %>%
summary()
##
## Call:
## lm(formula = AvgTempCelsius ~ firstofmonth, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8506 -2.5745 -0.2731 3.0434 6.5285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.10342233 0.97309104 15.521 <0.0000000000000002 ***
## firstofmonth 0.00002892 0.00006984 0.414 0.679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.189 on 298 degrees of freedom
## Multiple R-squared: 0.000575, Adjusted R-squared: -0.002779
## F-statistic: 0.1714 on 1 and 298 DF, p-value: 0.6791
# Plot monthly temperature with a fitted trend line.
# Fixed: the original piped an lm object into ggplot(), which only worked
# because ggplot2 fortify()s the model back into a data frame containing
# the same columns; plotting the data directly is equivalent and clear.
nz_monthly_temp %>%
  ggplot(aes(firstofmonth, AvgTempCelsius)) +
  geom_line() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme_bw() +
  labs(y = "Average Temperature (Degrees Celsius)", x = "Month",
       title = "Average Monthly Temperature in NZ With Best Fit Line",
       subtitle = "1995-2019")
## `geom_smooth()` using formula 'y ~ x'
The adjusted R-squared for a linear model of temperature ~ month across the record is -0.002779 (effectively zero, per the model output above). This indicates essentially no relationship between temperature and time, with a p-value of 0.679. Below, the residual graphs for this fit (using the term loosely) are displayed.
# Fit each model once and reuse it (the original refit the identical
# restricted model three separate times).
# Restricted model: temperature as a function of time only.
restricted <- nz_monthly_temp %>%
  lm(AvgTempCelsius~firstofmonth, data= .)
# Full model: time plus season as a categorical control.
full <- nz_monthly_temp %>%
  lm(AvgTempCelsius~firstofmonth+as.factor(season), data= .)
# Residual diagnostic plots for the restricted fit.
plot(restricted)
summary(restricted)
##
## Call:
## lm(formula = AvgTempCelsius ~ firstofmonth, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8506 -2.5745 -0.2731 3.0434 6.5285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.10342233 0.97309104 15.521 <0.0000000000000002 ***
## firstofmonth 0.00002892 0.00006984 0.414 0.679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.189 on 298 degrees of freedom
## Multiple R-squared: 0.000575, Adjusted R-squared: -0.002779
## F-statistic: 0.1714 on 1 and 298 DF, p-value: 0.6791
# Summary of the full (season-controlled) model; reuses the fit stored in
# `full` above rather than refitting the same model.
summary(full)
##
## Call:
## lm(formula = AvgTempCelsius ~ firstofmonth + as.factor(season),
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7181 -0.8873 -0.0427 0.9653 3.5317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.88247064 0.45600982 34.829 < 0.0000000000000002
## firstofmonth 0.00004380 0.00003126 1.401 0.162
## as.factor(season)spring -1.96062316 0.23302587 -8.414 0.00000000000000173
## as.factor(season)summer 2.91907799 0.23295727 12.531 < 0.0000000000000002
## as.factor(season)winter -4.88876814 0.23297276 -20.984 < 0.0000000000000002
##
## (Intercept) ***
## firstofmonth
## as.factor(season)spring ***
## as.factor(season)summer ***
## as.factor(season)winter ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.427 on 295 degrees of freedom
## Multiple R-squared: 0.802, Adjusted R-squared: 0.7993
## F-statistic: 298.7 on 4 and 295 DF, p-value: < 0.00000000000000022
# Residual diagnostic plots for the full (season-controlled) model.
plot(full)
# Compare the nested models: does adding season significantly improve fit?
anova(restricted, full, test="LRT")
## Analysis of Variance Table
##
## Model 1: AvgTempCelsius ~ firstofmonth
## Model 2: AvgTempCelsius ~ firstofmonth + as.factor(season)
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 298 3030.12
## 2 295 600.34 3 2429.8 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Alboukadel Kassambara (2020). ggpubr: ‘ggplot2’ Based Publication Ready Plots. R package version 0.4.0. https://CRAN.R-project.org/package=ggpubr
Annual glacier ice volumes, 1977-2016 - Environmental Reporting | | GIS Map Data | MfE Data Service. (2017, October 16). New Zealand Ministry for the Environment. Retrieved May 8, 2022, from https://data.mfe.govt.nz/table/89472-annual-glacier-ice-volumes-19772016/
Brian G. Peterson and Peter Carl (2020). PerformanceAnalytics: Econometric Tools for Performance and Risk Analysis. R package version 2.0.4. https://CRAN.R-project.org/package=PerformanceAnalytics
Beijing Multi-Site Air-Quality Data Data Set. UCI Machine Learning Repository. (n.d.). Retrieved May 12, 2022, from https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data
Climate Mumbai India. (n.d.), Retrieved May 8, 2022, from https://en.climate-data.org/asia/india/maharashtra/mumbai-29/
Correlation matrix : A quick start guide to analyze, format and visualize a correlation matrix using R software - Easy Guides - Wiki - STHDA. (n.d.). Sthda.Com. Retrieved December 12, 2021, from http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
Daily Temperature of Major Cities. (2022, April 22). Kaggle. https://www.kaggle.com/sudalairajkumar/daily-temperature-of-major-cities
French Press Agency (FPA). (2021, October 5). Global warming threatens big cities, study shows. https://www.dailysabah.com/life/environment/global-warming-threatens-big-cities-study-show
Greenstone, M., He, G., Li, S., & Zou, E. Y. (2021). China’s war on pollution: Evidence from the first 5 years. Review of Environmental Economics and Policy, 15(2), 281–299. https://doi.org/10.1086/715550
H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.7. https://CRAN.R-project.org/package=dplyr
Hao Zhu (2021). kableExtra: Construct Complex Table with ‘kable’ and Pipe Syntax. R package version 1.3.4. https://CRAN.R-project.org/package=kableExtra
Keon-Woong Moon. R statistics and graphs for medical papers. Hannaare Seoul, 2015.
Keon-Woong Moon (2021). webr: Data and Functions for Web-Based Analysis. R package version 0.1.6. https://github.com/cardiomoon/webr
Lombrana, L.M. and S. Dodge. (2021, April 19). Whatever Climate Change does to the world, cities will be hit hardest. https://www.bloomberg.com/graphics/2021-cities-climate-victims/
Rinker, T. W. & Kurkiewicz, D. (2017). pacman: Package Management for R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
Roser, M. (2013). Economic Growth. Published online at OurWorldinData.org. Retrieved from: https://ourworldindata.org/economic-growth (online resource)
R Core Team. (2016). R: A Language and Environment for Statistical Computing. Vienna, Austria. Retrieved from https://www.R-project.org/
University Corporation for Atmospheric Research (UCAR). (n.d.). Urban Heat Islands. Retrieved May 6, 2022, from https://scied.ucar.edu/learning-zone/climate-change-impacts/urban-heat-islands
Urban population | Data. (n.d.). World Bank. Retrieved November 13, 2021, from https://data.worldbank.org/indicator/SP.URB.TOTL
Wickham et al., (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686
Yuan Tang, Masaaki Horikoshi, and Wenxuan Li. “ggfortify: Unified Interface to Visualize Statistical Result of Popular R Packages.” The R Journal 8.2 (2016): 478-489.
# lm of daily NZ temperature against date (the "ANNUAL" analysis).
# NOTE(review): lm(AvgTempCelsius ~ parsedDate, data = nz_temp) would be the
# cleaner form; kept as-is so the echoed output below still matches.
nz_annual_lm <- lm( nz_temp$AvgTempCelsius~nz_temp$parsedDate)
summary(nz_annual_lm)
##
## Call:
## lm(formula = nz_temp$AvgTempCelsius ~ nz_temp$parsedDate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4136 -2.6826 -0.1062 2.9014 8.6922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.09429858 0.19983710 75.533 <0.0000000000000002 ***
## nz_temp$parsedDate 0.00002758 0.00001433 1.925 0.0543 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.598 on 9071 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.0004082, Adjusted R-squared: 0.000298
## F-statistic: 3.704 on 1 and 9071 DF, p-value: 0.0543
# Residual diagnostic plots for the daily fit.
plot(nz_annual_lm)
Glacial
To display the two models side by side, the research group wrote a function to display each linear fit over the data with model descriptors included on the plot for ease of interpretation. The plots below appear to show two visually identical regressions - but the numeric characteristics of the models are vastly different. The top chart is the restricted model, and the bottom is the full model factoring seasonality.
# Plot a fitted bivariate lm over its data, annotating the title with the
# adjusted R^2, intercept, slope, and slope p-value.
# Updated: aes_string() is deprecated in ggplot2; use the .data pronoun
# with the stored column names instead. fit$coef is spelled out as
# fit$coefficients to avoid partial matching.
ggplotRegression <- function (fit) {
  require(ggplot2)
  yvar <- names(fit$model)[1]   # response column name
  xvar <- names(fit$model)[2]   # predictor column name
  coefs <- fit$coefficients
  ggplot(fit$model, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_line() +
    stat_smooth(method = "lm", col = "red") +
    labs(x = "Month",
         title = paste("Adj R2 = ", signif(summary(fit)$adj.r.squared, 5),
                       "Intercept =", signif(coefs[[1]], 5),
                       " Slope =", signif(coefs[[2]], 5),
                       " P =", signif(summary(fit)$coef[2, 4], 5)))
}
# Restricted (top) and full (bottom) model plots, stacked for comparison.
grid.arrange(ggplotRegression(restricted), ggplotRegression(full) )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Convert ice and temp NZ data to a time series
# Convert the NZ monthly temperature and ice series to monthly ts objects.
# Fixed: select() on the grouped data also returned the Year grouping column
# (the echoed warning "Adding missing grouping variables: `Year`"), so ts()
# received two columns and decompose() was not given a clean univariate
# series. Extracting the single column directly avoids that.
time_series_temp <- ts(nz_monthly_temp$AvgTempCelsius, frequency = 12, start = c(1995, 1))
time_series_ice <- ts(nz_monthly_temp$monthly_ice, frequency = 12, start = c(1995, 1))
plot.ts(time_series_temp)
plot.ts(time_series_ice)
# decompose() estimates the seasonal, trend, and irregular ("white noise")
# components, which can then be subtracted from the original series.
# These are estimates, of course.
ts_temp_components <- decompose(time_series_temp)
ts_ice_components <- decompose(time_series_ice)
# Plot the stored decompositions (the original recomputed decompose() here).
plot(ts_temp_components)
plot(ts_ice_components)
# Remove the estimated seasonality effect from the ice data. This smooths
# out some of the seasonal swings.
ts_ice_seasonallyadjusted <- time_series_ice - ts_ice_components$seasonal
plot(time_series_ice)
plot(ts_ice_seasonallyadjusted)
# Auto- and cross-correlations of the time-aligned ice and temperature series.
acf(ts.intersect(time_series_ice, time_series_temp))